Grab Dependencies

In [11]:
library(rvest)
library(dplyr)
library(ggplot2)
library(tidyr)
library(corrplot)
library(ape)

mod<-function(x,m){
  t1<-floor(x/m)
  return(x-t1*m)
}

Pull table from webpage

In [2]:
#read in webpage
url<-"https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number#Generation_III"
pg<-read_html(url)
table_nodes<-pg %>% html_nodes("table")
#only read table nodes 2 through 8 to get list of all pokemon
table_nodes<-table_nodes[2:8]
html_table(table_nodes[1],fill=TRUE)[[1]]
A data.frame: 170 × 6
KdexNdexMSPokémonTypeType
<chr><chr><lgl><chr><chr><chr>
#001#001NABulbasaur Grass Poison
#002#002NAIvysaur Grass Poison
#003#003NAVenusaur Grass Poison
#004#004NACharmanderFire Fire
#005#005NACharmeleonFire Fire
#006#006NACharizard Fire Flying
#007#007NASquirtle Water Water
#008#008NAWartortle Water Water
#009#009NABlastoise Water Water
#010#010NACaterpie Bug Bug
#011#011NAMetapod Bug Bug
#012#012NAButterfreeBug Flying
#013#013NAWeedle Bug Poison
#014#014NAKakuna Bug Poison
#015#015NABeedrill Bug Poison
#016#016NAPidgey Normal Flying
#017#017NAPidgeotto Normal Flying
#018#018NAPidgeot Normal Flying
#019#019NARattata Normal Normal
#019NARattata Dark Normal
#020#020NARaticate Normal Normal
#020NARaticate Dark Normal
#021#021NASpearow Normal Flying
#022#022NAFearow Normal Flying
#023#023NAEkans Poison Poison
#024#024NAArbok Poison Poison
#025#025NAPikachu ElectricElectric
#026#026NARaichu ElectricElectric
#026NARaichu ElectricPsychic
#027#027NASandshrew Ground Ground
#122#122NAMr. Mime Psychic Fairy
#123#123NAScyther Bug Flying
#124#124NAJynx Ice Psychic
#125#125NAElectabuzzElectricElectric
#126#126NAMagmar Fire Fire
#127#127NAPinsir Bug Bug
#128#128NATauros Normal Normal
#129#129NAMagikarp Water Water
#130#130NAGyarados Water Flying
#131#131NALapras Water Ice
#132#132NADitto Normal Normal
#133#133NAEevee Normal Normal
#134#134NAVaporeon Water Water
#135#135NAJolteon ElectricElectric
#136#136NAFlareon Fire Fire
#137#137NAPorygon Normal Normal
#138#138NAOmanyte Rock Water
#139#139NAOmastar Rock Water
#140#140NAKabuto Rock Water
#141#141NAKabutops Rock Water
#142#142NAAerodactylRock Flying
#143#143NASnorlax Normal Normal
#144#144NAArticuno Ice Flying
#145#145NAZapdos ElectricFlying
#146#146NAMoltres Fire Flying
#147#147NADratini Dragon Dragon
#148#148NADragonair Dragon Dragon
#149#149NADragonite Dragon Flying
#150#150NAMewtwo Psychic Psychic
#151#151NAMew Psychic Psychic

Create Pokedex

In [4]:
create_frames<-function(node){
  #get structure
  structure = html_table(node,fill=TRUE)[[1]]
  df = data.frame(structure[,c(2,4,5,6)])
  return(df)
}

#loop over table nodes to create complete table of pokemone
region_names<-c("Kanto", "Johto", "Hoenn", "Sinnoh", "Unova", "Kalos", "Alola")
list_frames<- list()
for (i in 1:length(table_nodes)){
  df<-create_frames(table_nodes[i])
  #create region list
  dim1<- dim(df)[1]
  region<- rep(region_names[i],dim1)
  df$Region<-region
  list_frames[[i]]<-df
}

#create pokedex, but need to clean up
pokedex<-do.call(rbind,list_frames)
pokedex_names<-c('Ndex','Name','Primary_Type','Secondary_Type','Region')
colnames(pokedex)<-pokedex_names
pokedex<- pokedex %>%
  arrange(Name)
#drop duplicate entries
pokedex<-pokedex[!duplicated(pokedex[c("Name")]),]
In [5]:
pokedex
A data.frame: 809 × 5
NdexNamePrimary_TypeSecondary_TypeRegion
<chr><chr><chr><chr><chr>
1#460Abomasnow Grass Ice Sinnoh
2#063Abra Psychic Psychic Kanto
3#359Absol Dark Dark Hoenn
4#617Accelgor Bug Bug Unova
5#681Aegislash Steel Ghost Kalos
6#142AerodactylRock Flying Kanto
7#306Aggron Steel Rock Hoenn
8#190Aipom Normal Normal Johto
9#065Alakazam Psychic Psychic Kanto
10#594Alomomola Water Water Unova
11#334Altaria Dragon Flying Hoenn
12#698Amaura Rock Ice Kalos
13#424Ambipom Normal Normal Sinnoh
14#591Amoonguss Grass Poison Unova
15#181Ampharos ElectricElectricJohto
16#347Anorith Rock Bug Hoenn
17#752Araquanid Water Bug Alola
18#024Arbok Poison Poison Kanto
19#059Arcanine Fire Fire Kanto
20#493Arceus Normal Normal Sinnoh
21#566Archen Rock Flying Unova
22#567Archeops Rock Flying Unova
23#168Ariados Bug Poison Johto
24#348Armaldo Rock Bug Hoenn
25#683AromatisseFairy Fairy Kalos
26#304Aron Steel Rock Hoenn
27#144Articuno Ice Flying Kanto
28#531Audino Normal Normal Unova
29#699Aurorus Rock Ice Kalos
30#713Avalugg Ice Ice Kalos
827#293Whismur Normal Normal Hoenn
828#040WigglytuffNormal Fairy Kanto
829#767Wimpod Bug Water Alola
830#278Wingull Water Flying Hoenn
831#746WishiwashiWater Water Alola
832#202Wobbuffet Psychic Psychic Johto
833#527Woobat Psychic Flying Unova
834#194Wooper Water Ground Johto
835#413Wormadam Bug Grass Sinnoh
838#265Wurmple Bug Bug Hoenn
839#360Wynaut Psychic Psychic Hoenn
840#178Xatu Psychic Flying Johto
841#716Xerneas Fairy Fairy Kalos
842#796Xurkitree ElectricElectricAlola
843#562Yamask Ghost Ghost Unova
844#193Yanma Bug Flying Johto
845#469Yanmega Bug Flying Sinnoh
846#734Yungoos Normal Normal Alola
847#717Yveltal Dark Flying Kalos
848#335Zangoose Normal Normal Hoenn
849#145Zapdos ElectricFlying Kanto
850#523Zebstrika ElectricElectricUnova
851#644Zekrom Dragon ElectricUnova
852#807Zeraora ElectricElectricAlola
853#263Zigzagoon Normal Normal Hoenn
855#571Zoroark Dark Dark Unova
856#570Zorua Dark Dark Unova
857#041Zubat Poison Flying Kanto
858#634Zweilous Dark Dragon Unova
859#718Zygarde Dragon Ground Kalos

Scraping each page for pokemon stats

For each pokemon page, pull out the necessary tables

In [9]:
##################
#for each name in the pokedex go to its page and grab the stats and move tables
#this takes a while to run
pokemon_names<-pokedex$Name
summary_tables<-list()

for (i in 1:length(pokemon_names)){
  #case when two words
  if (pokemon_names[i] == "Nidoran♂"){
    pokemon_names[i] = "nidoran-m"
  }
  if (pokemon_names[i] == "Nidoran♀"){
    pokemon_names[i] = "nidoran-f"
  }
  if (pokemon_names[i] == "Mr. Mime"){
    pokemon_names[i] = "mr-mime"
  }
  if (pokemon_names[i] == "Mime Jr."){
    pokemon_names[i] = "mime-jr"
  }
  if (pokemon_names[i] == "Type: Null"){
    pokemon_names[i] = "type-null"
  }
  if (pokemon_names[i] == "Flabébé"){
    pokemon_names[i] = "flabebe"
  }
  if (pokemon_names[i] == "Tapu Koko"){
    pokemon_names[i] = "Tapu-Koko"
  }
  if (pokemon_names[i] == "Tapu Lele"){
    pokemon_names[i] = "Tapu-LeLe"
  }
  if (pokemon_names[i] == "Tapu Bulu"){
    pokemon_names[i] = "Tapu-Bulu"
  }
  if (pokemon_names[i] == "Tapu Fini"){
    pokemon_names[i] = "Tapu-Fini"
  }
  #makelowercase
  lowercase<-tolower(pokemon_names[i])
  #create url
  url<-paste0("https://pokemondb.net/pokedex/",lowercase)
  #get table nodes
  table_nodes<-read_html(url) %>%
    html_nodes("table")
  #use the fourth node
  table<-html_table(table_nodes[4])[[1]]
  #add pokemon name as column
  dim<-dim(table)[1]
  name<-rep(pokemon_names[i],dim)
  table$Name<-name
  #add to list summary tables
  summary_tables[[i]]<-table
  Sys.sleep(0.4)
  if (mod(i,50) == 0){
    print(i)
    print(Sys.time())
  }
}
[1] 50
[1] "2019-10-22 12:54:38 EDT"
[1] 100
[1] "2019-10-22 12:55:05 EDT"
[1] 150
[1] "2019-10-22 12:55:31 EDT"
[1] 200
[1] "2019-10-22 12:55:58 EDT"
[1] 250
[1] "2019-10-22 12:56:24 EDT"
[1] 300
[1] "2019-10-22 12:56:52 EDT"
[1] 350
[1] "2019-10-22 12:57:18 EDT"
[1] 400
[1] "2019-10-22 12:57:44 EDT"
[1] 450
[1] "2019-10-22 12:58:10 EDT"
[1] 500
[1] "2019-10-22 12:58:36 EDT"
[1] 550
[1] "2019-10-22 12:59:02 EDT"
[1] 600
[1] "2019-10-22 12:59:28 EDT"
[1] 650
[1] "2019-10-22 12:59:54 EDT"
[1] 700
[1] "2019-10-22 13:00:21 EDT"
[1] 750
[1] "2019-10-22 13:00:47 EDT"
[1] 800
[1] "2019-10-22 13:01:14 EDT"
In [10]:
#need to clean
pokemon_stats<-do.call(rbind,summary_tables)

#create base states df
base_stats<-pokemon_stats[,c("X1","X2","Name")]
base_stats<-spread(base_stats, X1,X2)
base_stats$Name<-pokedex$Name
base_names<-c("Name","Attack","Defense","HP","Sp_Attack","Sp_Defense","Speed","Total")
colnames(base_stats)<-base_names


#create min stats frame
min_stats<-pokemon_stats[,c("X1","X4","Name")]
min_stats<-spread(min_stats,X1,X4)
min_stats<-subset(min_stats,select=-c(Total))
min_names<-c("Name", "Min_Attack", "Min_Defense","Min_HP", 
             "Min_Sp_Attack","Min_Sp_Defense", "Min_Speed")
colnames(min_stats)<-min_names
min_stats$Name<-pokedex$Name

#create max stats
max_stats<-pokemon_stats[,c("X1","X5","Name")]
max_stats<-spread(max_stats,X1,X5)
max_stats<-subset(max_stats,select=-c(Total))
max_names<-c("Name", "Max_Attack", "Max_Defense","Max_HP", 
             "Max_Sp_Attack","Max_Sp_Defense", "Max_Speed")
colnames(max_stats)<-max_names
max_stats$Name<-pokedex$Name

#merge all 4 together
a<-merge(pokedex,base_stats,by='Name')
b<-merge(a,min_stats,by='Name')
final<-merge(b,max_stats,by='Name')

#adjust dtype
final$Min_Attack<-as.integer(final$Min_Attack)
final$Min_Defense<-as.integer(final$Min_Defense)
final$Min_HP<-as.integer(final$Min_HP)
final$Min_Sp_Attack<-as.integer(final$Min_Sp_Attack)
final$Min_Sp_Defense<-as.integer(final$Min_Sp_Defense)
final$Min_Speed<-as.integer(final$Min_Speed)

final$Max_Attack<-as.integer(final$Max_Attack)
final$Max_Defense<-as.integer(final$Max_Defense)
final$Max_HP<-as.integer(final$Max_HP)
final$Max_Sp_Attack<-as.integer(final$Max_Sp_Attack)
final$Max_Sp_Defense<-as.integer(final$Max_Sp_Defense)
final$Max_Speed<-as.integer(final$Max_Speed)
In [4]:
dir<-"/Users/janmichaelaustria/Google Drive/UNH Fall Expansion/UNH Fall 2019/DATA900 Webscraping"
setwd(dir)
write.csv(final,"/Users/janmichaelaustria/Google Drive/UNH Fall Expansion/UNH Fall 2019/DATA900 Webscraping/pokemon.csv", row.names = FALSE)
Error in is.data.frame(x): object 'final' not found
Traceback:

1. write.csv(final, "/Users/janmichaelaustria/Google Drive/UNH Fall Expansion/UNH Fall 2019/DATA900 Webscraping/pokemon.csv", 
 .     row.names = FALSE)
2. eval.parent(Call)
3. eval(expr, p)
4. eval(expr, p)
5. write.table(final, "/Users/janmichaelaustria/Google Drive/UNH Fall Expansion/UNH Fall 2019/DATA900 Webscraping/pokemon.csv", 
 .     row.names = FALSE, col.names = TRUE, sep = ",", dec = ".", 
 .     qmethod = "double")
6. is.data.frame(x)
In [2]:
#import final
dir<-"/Users/janmichaelaustria/Google Drive/UNH Fall Expansion/UNH Fall 2019/DATA900 Webscraping"
setwd(dir)
final <- read.csv(file="pokemon.csv", header=TRUE, sep=",")
In [252]:
colnames(final)[6:24]
  1. 'Attack'
  2. 'Defense'
  3. 'HP'
  4. 'Sp_Attack'
  5. 'Sp_Defense'
  6. 'Speed'
  7. 'Total'
  8. 'Min_Attack'
  9. 'Min_Defense'
  10. 'Min_HP'
  11. 'Min_Sp_Attack'
  12. 'Min_Sp_Defense'
  13. 'Min_Speed'
  14. 'Max_Attack'
  15. 'Max_Defense'
  16. 'Max_HP'
  17. 'Max_Sp_Attack'
  18. 'Max_Sp_Defense'
  19. 'Max_Speed'

ARE POKEMON STRONGER IN A ONE GENERATION?

In [337]:
options(repr.plot.width=4, repr.plot.height=4)
final %>%
    group_by(Region) %>%
    select(c(colnames(final)[6:24])) %>%
    summarise_all(funs(mean)) %>%
    ggplot(aes(x=Region, y=Attack,fill=Region)) +
    geom_bar(stat="identity")
Adding missing grouping variables: `Region`
In [238]:
options(repr.plot.width=14, repr.plot.height=7)
plot_multi_histogram <- function(df, feature, label_column) {
  plt <- ggplot(df, aes(x=eval(parse(text=feature)), fill=eval(parse(text=label_column)))) +
    geom_histogram(alpha=0.7, position="identity", aes(y = ..density..), color="black") +
    geom_density(alpha=0.4) +
    geom_vline(aes(xintercept=mean(eval(parse(text=feature)))), color="black", linetype="dashed", size=1) +
    labs(x=feature, y = "Density")
  plt + guides(fill=guide_legend(title=label_column))
}
plot_multi_histogram(final,"Total","Primary_Type") + facet_wrap( ~ Region, ncol=3)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Attack by Defense

In [261]:
options(repr.plot.width=11, repr.plot.height=11)
ggplot(final,aes(x=Defense, y=Attack)) +
    geom_point(aes(colour = factor(Primary_Type),size=Total),alpha=0.6) +
    geom_text(data=subset(final, Attack > 130 | Defense > 130),aes(label=Name)) +
    geom_text(data=subset(final, HP > 210),aes(label=Name)) +
    geom_smooth(method = 'lm',se = TRUE,color='red')
In [110]:
options(repr.plot.width=20, repr.plot.height=50)
options(warn=-1)
ggplot(final, aes(y=as.integer(Attack), x=factor(Primary_Type), fill=factor(Primary_Type))) + 
  geom_boxplot() + 
    facet_wrap( ~ Region, ncol=1) + 
    geom_text(data=subset(final, Attack > 100 | Attack < 50),aes(label=Name),size = 7)  
    #coord_flip()

For the first and second generation

In [131]:
options(repr.plot.width=30, repr.plot.height=70)
options(warn=-1)
final %>%
    filter(Region %in% c("Kanto",'Johto')) %>%
    ggplot(aes(x=factor(Primary_Type), y = as.integer(Attack),fill=factor(Region))) +
    geom_boxplot() + 
    coord_flip() +
    facet_wrap(~Region,ncol = 2) +
    geom_text(aes(label=Name,colour=Primary_Type,size=Attack),fontface = "bold",
              position=position_jitter(width=.7,height=.5))+
    scale_fill_manual(values = c("Kanto" = "grey",
                               "Johto" = "white"))+
    scale_size(range = c(5, 12)) +
    theme(axis.text.x= element_text(size =40)) +
    theme(axis.text.y= element_text(size =40)) +
    theme(axis.title.x = element_text(size = 40),
         axis.title.y = element_text(size = 40)) +
    theme(strip.text.x = element_text(size = 40))
In [99]:
options(repr.plot.width=7, repr.plot.height=7)
num.cols <- sapply(final, is.numeric)
cor.data <- cor(final[,num.cols])
corrPLOT<-corrplot(cor.data,method='ellipse')

Clustering, H_Clust,Generation 1 and 2



$\|a-b\|_{2} = \sqrt{\sum_{i=1}^{n} (a_i - b_i)^2}$

Complete Linkage:

$\operatorname{argmax}{d(a,b): a \epsilon A, b \epsilon B} $

In [100]:
options(repr.plot.width=20, repr.plot.height=20)
options(warn=-1)
final_K_J<- final %>%
    filter(Region %in% c("Kanto","Johto"))
numeric_columns<-subset(final_K_J, select = -c(Name,Ndex,
                                           Primary_Type,
                                           Secondary_Type,
                                           Region))
hc.complete=hclust(dist(numeric_columns), method="complete")
hc.complete$labels<-final_K_J$Name
hcd <- as.dendrogram(hc.complete)
hcd_phylo<-as.phylo(hc.complete)
hcd_phylo$tip.label<-as.character(final_K_J$Name)
colors = c("red", "blue", "green", "black","pink","magenta")
clus6 = cutree(hc.complete, 6)
plot(hcd_phylo, type = "fan", tip.color = colors[clus6],
     label.offset = 1, cex = 0.7)

PCA and HDBSCAN

In [90]:
numeric_columns<-subset(final, select = -c(Name,Ndex,
                                           Primary_Type,
                                           Secondary_Type,
                                           Region))
pc_scores<-pr.out$x
pr.out<-prcomp(numeric_columns, scale=TRUE)

#add first and second to PC scores to final
final$PC1<-pc_scores[,1]
final$PC2<-pc_scores[,2]
In [206]:
#creat arrors for featurs in frist two PCS
pc_arrows<-pr.out$rotation
pc_arrows<-pc_arrows[,c(1,2)]
In [207]:
options(repr.plot.width=10, repr.plot.height=7)
#Bi plots
p<-final %>%
  ggplot(aes(x=PC1,y=PC2)) +
  geom_point(aes(colour=Region,size=Attack),alpha=.7) +
  geom_text(data=subset(final, Attack > 130 | Defense > 180),aes(label=Name)) 
  #geom_text(aes(label=Name))
p +   geom_segment(mapping=aes(x=0, y=0, xend=pc_arrows[1,1]*40, yend=pc_arrows[1,2]*40), 
               arrow=arrow(), size=1, color="red",alpha=0.5) +
    geom_text(x=pc_arrows[1,1]*40, y=pc_arrows[1,2]*40, label=rownames(pc_arrows)[1], size=5,alpha=.2) +
    geom_segment(mapping=aes(x=0, y=0, xend=pc_arrows[2,1]*40, yend=pc_arrows[2,2]*40), 
               arrow=arrow(), size=1, color="red",alpha=0.5) +
    geom_text(x=pc_arrows[2,1]*40, y=pc_arrows[2,2]*40, label=rownames(pc_arrows)[2], size=5,alpha=.2) +
    geom_segment(mapping=aes(x=0, y=0, xend=pc_arrows[3,1]*40, yend=pc_arrows[3,2]*40), 
               arrow=arrow(), size=1, color="red",alpha=0.5) +
    geom_text(x=pc_arrows[3,1]*40, y=pc_arrows[3,2]*40, label=rownames(pc_arrows)[3], size=5,alpha=.2) +
    geom_segment(mapping=aes(x=0, y=0, xend=pc_arrows[6,1]*40, yend=pc_arrows[6,2]*40), 
               arrow=arrow(), size=1, color="red",alpha=0.5) +
    geom_text(x=pc_arrows[6,1]*40, y=pc_arrows[6,2]*40, label=rownames(pc_arrows)[6], size=5,alpha=.2)

Looks like there are more dense areas than others

In [197]:
options(repr.plot.width=10, repr.plot.height=7)
#denstiy biplots
final%>%
  ggplot(aes(x=PC1, y=PC2) ) +
  geom_hex(bins=30) +
  scale_fill_continuous(type = "viridis",alpha=.8) +
  theme_bw()

HDBSCAN Attempt

In [199]:
#lets try HDBSCAN, onlny on the first two PCS
options(warn=-1)
library(dbscan)
cl_obj<-hdbscan(subset(final,select = c(PC1,PC2)),minPts = 3)
plot(cl_obj$hc)
#put the labels back in
final$hdbscan_labels<-cl_obj$cluster
In [208]:
#put the labels back in
final$hdbscan_labels<-cl_obj$cluster
final %>%
  ggplot(aes(x=PC1,y=PC2)) +
  geom_point(aes(size=Attack,colour=factor(hdbscan_labels)),alpha=0.7) +
  geom_text(data=subset(final, Attack > 135 | Defense > 180),aes(label=Name))
In [456]:
options(repr.plot.width=20, repr.plot.height=12)
final %>%
  #filter(Region=='Sinnoh') %>%
  ggplot(aes(x=factor(hdbscan_labels),y=Attack,)) +
  geom_boxplot() +
  geom_text(data=subset(final, Attack > 135 | Defense > 180),aes(label=Name,colour=Primary_Type,size=Attack),fontface = "bold",
              position=position_jitter(width=.7,height=.5)) +
  coord_flip()

Can we cluster yet again on the larger group?

In [229]:
#now try only clustering group 5
final_cluster_5<-final[final$hdbscan_labels == 5,]

final_cluster_5 %>%
  ggplot(aes(x=PC1,PC2)) +
  geom_point(aes(colour=Region,size=Attack),alpha=0.7) +
  geom_text(data=subset(final_cluster_5, Attack < 150 & Attack > 120),aes(label=Name))
In [289]:
cl_obj_5<-hdbscan(subset(final_cluster_5,select = c(PC1,PC2)),minPts =10)
final_cluster_5$hdbscan_labels<-cl_obj_5$cluster

#check clusters
final_cluster_5 %>%
  ggplot(aes(x=PC1,PC2)) +
  geom_point(aes(colour=factor(hdbscan_labels),size=Total),alpha=0.7)

Cluster Distributions

In [319]:
options(repr.plot.width=15, repr.plot.height=10)
final_cluster_5 %>%
    filter(hdbscan_labels != 0) %>%
    ggplot(aes(y=as.integer(Attack), x=factor(hdbscan_labels),fill=factor(hdbscan_labels))) + 
    geom_boxplot() +
    geom_text(data=subset(final_cluster_5,Attack > 120),aes(label=Name)) +
    geom_text(data=subset(final_cluster_5,Attack < 100 & Attack > 70),aes(label=Name))

Machine Learning Model: Can we predict what generation a poekmon is from?

Lets try a linear kernel SVM

In [358]:
#the summary stats indicate the avg stats by region are different than one another
#if we tie the avg stats to each pokemon stats maybe the classification by region will be better
library(e1071)
X<-pr.out$x
y<-final$Region
dat<-data.frame(x=X,y=as.factor(y))
svmfit=svm(y~., data=dat, kernel="linear", cost=0.1, scale=FALSE)
set.seed (1)
tune.out<-tune(svm,y~.,data=dat,kernel="linear",ranges =list(cost=c(0.001, 0.01, 0.1, 1,5,10,100)))
summary(tune.out)
Parameter tuning of ‘svm’:

- sampling method: 10-fold cross validation 

- best parameters:
 cost
    1

- best performance: 0.7317593 

- Detailed performance results:
   cost     error dispersion
1 1e-03 0.8256944 0.01985107
2 1e-02 0.7590741 0.05525299
3 1e-01 0.7392438 0.04057165
4 1e+00 0.7317593 0.03033373
5 5e+00 0.7429012 0.03931199
6 1e+01 0.7416512 0.03925989
7 1e+02 0.7428858 0.04192020
In [361]:
best_mod_svm_lin<-tune.out$best.model
svm_lin_accuracy<-0.7317593

Lets try and rbf_svm

In [362]:
svmfit=svm(y~., data=dat, kernel="linear", cost=0.1, scale=FALSE)
set.seed (1)
tune.out<-tune(svm,y~.,data=dat,kernel="radial",
               ranges =list(cost=c(0.001, 0.01, 0.1, 1,5,10,100)),
              gamma=c(0.5,1,2,3,4))
summary(tune.out)
Parameter tuning of ‘svm’:

- sampling method: 10-fold cross validation 

- best parameters:
 cost
    1

- best performance: 0.6971605 

- Detailed performance results:
   cost     error dispersion
1 1e-03 0.8244599 0.01927211
2 1e-02 0.8244599 0.01927211
3 1e-01 0.8244599 0.01927211
4 1e+00 0.6971605 0.05735997
5 5e+00 0.6996759 0.03869834
6 1e+01 0.7021296 0.04428662
7 1e+02 0.7021296 0.03943174
In [363]:
svm_rbf_accuracy<-0.6971605

Let's try logistic regression

In [364]:
library(tidyverse)
library(caret)
library(nnet)
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
 tibble  2.1.3      purrr   0.3.2
 readr   1.3.1      stringr 1.4.0
 tibble  2.1.3      forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
 dplyr::filter()         masks stats::filter()
 readr::guess_encoding() masks rvest::guess_encoding()
 dplyr::lag()            masks stats::lag()
 purrr::pluck()          masks rvest::pluck()
Loading required package: lattice

Attaching package: ‘caret’

The following object is masked from ‘package:purrr’:

    lift

In [441]:
log_reg <- multinom(y~., data = data_frame(X))
# weights:  147 (120 variable)
initial  value 1574.241311 
iter  10 value 1524.414645
iter  20 value 1515.855981
iter  30 value 1513.930299
iter  40 value 1512.438410
iter  50 value 1443.898997
iter  60 value 1411.849104
iter  70 value 1399.150504
iter  80 value 1393.617706
iter  90 value 1390.100124
iter 100 value 1388.995223
final  value 1388.995223 
stopped after 100 iterations
In [442]:
predicted.classes <- log_reg %>% predict(data_frame(X))
log_reg_acc<-mean(predicted.classes == y)
log_reg_acc
0.312731767614339

Let's try a Random Forest

In [390]:
library(randomForest)
set.seed(1)
bag.model<-randomForest(y~.,data=X,mtry=6,importance =TRUE,ntree=55)
bag.model
Call:
 randomForest(formula = y ~ ., data = X, mtry = 6, importance = TRUE,      ntree = 55) 
               Type of random forest: classification
                     Number of trees: 55
No. of variables tried at each split: 6

        OOB estimate of  error rate: 58.96%
Confusion matrix:
       Alola Hoenn Johto Kalos Kanto Sinnoh Unova class.error
Alola     18    15     7    11    12      6    19   0.7954545
Hoenn      9    58    22     1    20     15    10   0.5703704
Johto      5    22    21     4    25     11    12   0.7900000
Kalos      8     6     3    35     2      4    14   0.5138889
Kanto      6    27    15     1    74      9    19   0.5099338
Sinnoh     9    11    15     8     9     42    13   0.6074766
Unova      7    18    10     8    20      9    84   0.4615385
In [391]:
random_forest_accuracy<-0.5896

Now finally, lets try boosting, its classification so adaboost

In [402]:
library(gbm)
set.seed(1)
boost.model<-gbm(y~.,data=data_frame(X),distribution= "multinomial",
                 n.trees=5000, interaction.depth=4,shrinkage =0.2, verbose=F)
boost.model
gbm(formula = y ~ ., distribution = "multinomial", data = data_frame(X), 
    n.trees = 5000, interaction.depth = 4, shrinkage = 0.2, verbose = F)
A gradient boosted model with multinomial loss function.
5000 iterations were performed.
There were 1 predictors of which 1 had non-zero influence.
In [427]:
predictions_boosted<-predict(boost.model,newdata = data_frame(X),n.trees=5000,type="response")
p.predBST <- apply(predictions_boosted, 1, which.max)
In [429]:
for (i in 1:length(y)){
    if (p.predBST[i] == 1){
        p.predBST[i] = 'Alola'
    }
    if (p.predBST[i] == 2){
        p.predBST[i] = "Hoenn"
    }
    if (p.predBST[i] == 3){
        p.predBST[i] = "Johto"
    }
    if (p.predBST[i] == 4){
        p.predBST[i] = "Kalos"
    }
    if (p.predBST[i] == 5){
        p.predBST[i] = "Kanto"
    }
    if (p.predBST[i] == 6){
        p.predBST[i] = "Sinnoh"
    }
    if (p.predBST[i] == 7){
        p.predBST[i] = "Unova"
    }
}
In [437]:
mean(y == factor(p.predBST))
0.974042027194067
In [438]:
gradient_boosted_tree_accuracy<-mean(y == factor(p.predBST))

Summary of Models

In [451]:
svm_lin_accuracy<-0.7317
svm_rbf_accuracy<-0.6971
log_reg_accuracy<-0.3127
random_forest_accuracy<-0.5896
grad_boost_accuracy<-0.9740
values<-rbind(0.7317,0.6971,0.3127,0.5896,0.9740)
names<-c('svm_lin_accuracy','svm_rbf_accuracy','log_reg_accuracy',
        'random_forest_accuracy','grad_boost_accuracy')
summary<-cbind(names,values)
(data_frame(x=values,y=names))
A tibble: 5 × 2
xy
<dbl[,1]><chr>
0.7317svm_lin_accuracy
0.6971svm_rbf_accuracy
0.3127log_reg_accuracy
0.5896random_forest_accuracy
0.9740grad_boost_accuracy
In [1]:
!jupyter nbconvert --to html Webscraping_Pokemon.ipynb
Error in parse(text = x, srcfile = src): <text>:1:10: unexpected symbol
1: !jupyter nbconvert
             ^
Traceback:
In [ ]: